Content
A. Identified gene number
B. Identified genes’ biotype
C. Identified transcript number
D. Identified transcripts’ biotype
E. Correlation of different method quantification
F. GO&KEGG enrichment of method-specific genes
Biotype of Library-specific isoforms
Biotype of Library-specific genes
GO&KEGG of Library-specific genes
preparation
suppressPackageStartupMessages(library(data.table))
gtf <- rtracklayer::readGFF("/mnt/raid61/Personal_data/tangchao/Document/gencode/human/GRCh37/gencode.v30lift37.annotation.gtf")
gtf <- rtracklayer::readGFF("/mnt/raid61/Personal_data/tangchao/Document/gencode/human/GRCh37/gencode.v30lift37.annotation.gtf")
setDT(gtf)
gtf <- gtf[type %in% c("gene", "transcript"), .(seqid, type, start, end, strand, gene_id, gene_type, gene_name, transcript_id, transcript_type)]
gtf[, gene_id:=substr(gene_id, 1, 15)]
gtf[, transcript_id:=substr(transcript_id, 1, 15)]
Gene_biotype <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/Annotation/Gene_biotype.csv", header = F, select = 1:2)
setkey(Gene_biotype, V1)
gtf_gene <- gtf[type == "gene", ]
gtf_gene$gene_biotype <- Gene_biotype[gtf_gene$gene_type, V2]
gtf_gene <- unique(gtf_gene[, .(gene_id, gene_biotype)])
setkey(gtf_gene, gene_id)
gtf_tx <- gtf[type == "transcript", ]
gtf_tx$transcript_biotype <- Gene_biotype[gtf_tx$transcript_type, V2]
gtf_tx <- unique(gtf_tx[, .(transcript_id, transcript_biotype)])
setkey(gtf_tx, transcript_id)
# col.p <- lattice::trellis.par.get("superpose.symbol")$col[1] # colour of PolyA RNA
# col.t <- lattice::trellis.par.get("superpose.symbol")$col[2] # colour of Total RNA
col.p <- "#00AFBB"
col.t <- "#E7B800"
SampInfo_PolyA <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/SampleInfo/PolyA_RNA_sampleInfo.txt")
SampInfo_Total <- fread("/mnt/raid61/Personal_data/tangchao/ScientificData/data/SampleInfo/Total_RNA_sampleInfo.txt")
A. Identified gene number
PolyA
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))
filtering
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))
Total
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))
filtering
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
Mat <- rbind(data.frame(Gene = mapply(nrow, Gene_PolyA), Library = "PolyA"), data.frame(Gene = mapply(nrow, Gene_Total), Library = "Total"))
library(ggpubr)
my_comparisons = list(c("PolyA", "Total"))
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
palette = c(col.p, col.t),
add = "boxplot",
add.params = list(fill = "white"))+
stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")+ # Add significance levels
stat_compare_means(method = "t.test", label.x.npc = "center")+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16))+
labs(y = "No. genes")
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
palette = c(col.p, col.t),
add = "boxplot",
add.params = list(fill = "white"))+
stat_compare_means(label = "p.format", comparisons = my_comparisons, method = "t.test")+ # Add significance levelsaes(label = paste0("p = ", ..p.format..))
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16))+
labs(y = "No. genes")
input = list(PolyA = PolyA_gene, Total = Total_gene)
lab1 <- paste(sum(!input[[1]] %in% input[[2]]), "\n(", round(mean(!input[[1]] %in% input[[2]]) * 100, 2), "%)", sep = "")
lab2 <- length(intersect(input[[1]], input[[2]]))
lab3 <- paste(sum(!input[[2]] %in% input[[1]]), "\n(", round(mean(!input[[2]] %in% input[[1]]) * 100, 2), "%)", sep = "")
library(ggplot2)
library(ggforce)
library(cowplot)
ggplot() + geom_circle(aes(x0 = c(-1, 1),
y0 = c(0, 0),
r = c(2, 2),
color = c(col.p, col.t),
fill = c(col.p, col.t)),
lwd = 1.5,
alpha = .1)+
guides(color = F, fill = F, alpha = F)+
scale_fill_manual(values = c(col.p, col.t)) +
scale_color_manual(values = c(col.p, col.t)) +
theme_nothing() +
ggplot2::annotate("text", x = c(-2, 0, 2), y = 0, label = c(lab1, lab2, lab3), size = 4) +
ggplot2::annotate("text", x = c(-1, 1), y = 2.2, label = names(input), size = 6)
B. Identified genes’ biotype
Total_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_PolyA)
Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)
data_summary <- function(data, varname, groupnames){
require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum<-ddply(data, groupnames, .fun=summary_func, varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}
df2 <- data_summary(gene_biotype, varname="value",
groupnames=c("Library", "Var1"))
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
ggplot(df2, aes(x=Var1, y=value, fill=Library)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
position=position_dodge(.9)) +
labs(x = "Biotype", y = "No. gene")+
theme_classic() +
scale_fill_manual(values=c(col.p, col.t))+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 22, hjust = 1)) -> p
tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 8.1e-27 5.8e-06 3.4e-29 1.6e-12 5.1e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) +
max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03
p + ggplot2::annotate("text", x = 1:5,
y = y0,
label = lab)
plevel <- function(x) {
if(x < 1e-04) {
"****"
} else {
if(x < 0.001) {
"***"
} else {
if(x < 0.01) {
"**"
} else {
if(x < 0.05) {
"*"
} else {
"ns"
}
}
}
}
}
sapply(lab, plevel)
## Antisense lncRNA ncRNA Protein coding Pseudogene
## "****" "****" "****" "****" "ns"
p + ggplot2::annotate("text", x = 1:5,
y = y0,
label = sapply(lab, plevel))
Percentage
Total_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_gene[x[[1]],]$gene_biotype), Gene_PolyA)
Total_gene_biotype <- apply(Total_gene_biotype, 2, function(x) x/sum(x)*100)
PolyA_gene_biotype <- apply(PolyA_gene_biotype, 2, function(x) x/sum(x)*100)
Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)
df2 <- data_summary(gene_biotype, varname="value",
groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
position=position_dodge(.9)) +
labs(x = "Biotype", y = "Percentage of gene")+
theme_classic() +
scale_fill_manual(values=c(col.p, col.t))+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 22, hjust = 1)) -> p
tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 6.6e-41 2.1e-15 8.2e-43 4.1e-01 6.1e-04
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) +
max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03
p + ggplot2::annotate("text", x = 1:5,
y = y0,
label = lab)
sapply(lab, plevel)
## Antisense lncRNA ncRNA Protein coding Pseudogene
## "****" "****" "****" "ns" "***"
p + ggplot2::annotate("text", x = 1:5,
y = y0,
label = sapply(lab, plevel))
C. Identified transcript number
PolyA
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".isoforms.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))
filtering
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, transcript_id:=substr(transcript_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))
Total
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".isoforms.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))
filtering
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, transcript_id:=substr(transcript_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
Mat <- rbind(data.frame(Gene = mapply(nrow, Gene_PolyA), Library = "PolyA"), data.frame(Gene = mapply(nrow, Gene_Total), Library = "Total"))
library(ggpubr)
my_comparisons = list(c("PolyA", "Total"))
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
palette = c(col.p, col.t),
add = "boxplot",
add.params = list(fill = "white"))+
stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")+ # Add significance levels
stat_compare_means(method = "t.test", label.x.npc = "center")+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16))+
labs(y = "No. isoform")
ggviolin(Mat, x = "Library", y = "Gene", fill = "Library",
palette = c(col.p, col.t),
add = "boxplot",
add.params = list(fill = "white"))+
stat_compare_means(label = "p.format", comparisons = my_comparisons, method = "t.test")+ # Add significance levelsaes(label = paste0("p = ", ..p.format..))
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16))+
labs(y = "No. isoform")
input = list(PolyA = PolyA_gene, Total = Total_gene)
lab1 <- paste(sum(!input[[1]] %in% input[[2]]), "\n(", round(mean(!input[[1]] %in% input[[2]]) * 100, 2), "%)", sep = "")
lab2 <- length(intersect(input[[1]], input[[2]]))
lab3 <- paste(sum(!input[[2]] %in% input[[1]]), "\n(", round(mean(!input[[2]] %in% input[[1]]) * 100, 2), "%)", sep = "")
library(ggplot2)
library(ggforce)
library(cowplot)
ggplot() + geom_circle(aes(x0 = c(-1, 1),
y0 = c(0, 0),
r = c(2, 2),
color = c(col.p, col.t),
fill = c(col.p, col.t)),
lwd = 1.5,
alpha = .1)+
guides(color = F, fill = F, alpha = F)+
scale_fill_manual(values = c(col.p, col.t)) +
scale_color_manual(values = c(col.p, col.t)) +
theme_nothing() +
ggplot2::annotate("text", x = c(-2, 0, 2), y = 0, label = c(lab1, lab2, lab3), size = 4) +
ggplot2::annotate("text", x = c(-1, 1), y = 2.2, label = names(input), size = 6)
D. Identified transcripts’ biotype
Total_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_PolyA)
Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)
df2 <- data_summary(gene_biotype, varname="value",
groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
position=position_dodge(.9)) +
labs(x = "Biotype", y = "No. isoform")+
theme_classic() +
scale_fill_manual(values=c(col.p, col.t))+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 22, hjust = 1)) -> p
tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
## Antisense lncRNA ncRNA Other Protein coding
## 6.6e-17 2.3e-04 5.9e-08 4.8e-04 7.7e-01
## Pseudogene
## 1.9e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) +
max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03
p + ggplot2::annotate("text", x = 1:6,
y = y0,
label = lab)
sapply(lab, plevel)
## Antisense lncRNA ncRNA Other Protein coding
## "****" "***" "****" "***" "ns"
## Pseudogene
## "ns"
p + ggplot2::annotate("text", x = 1:6,
y = y0,
label = sapply(lab, plevel))
Percentage
Total_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_Total)
PolyA_gene_biotype <- mapply(function(x) table(gtf_tx[x[[1]],]$transcript_biotype), Gene_PolyA)
Total_gene_biotype <- apply(Total_gene_biotype, 2, function(x) x/sum(x)*100)
PolyA_gene_biotype <- apply(PolyA_gene_biotype, 2, function(x) x/sum(x)*100)
Total_gene_biotype <- data.frame(reshape2::melt(Total_gene_biotype))
PolyA_gene_biotype <- data.frame(reshape2::melt(PolyA_gene_biotype))
Total_gene_biotype$Library <- "Total"
PolyA_gene_biotype$Library <- "PolyA"
gene_biotype <- rbind(PolyA_gene_biotype, Total_gene_biotype)
df2 <- data_summary(gene_biotype, varname="value",
groupnames=c("Library", "Var1"))
ggplot(df2, aes(x=Var1, y=value, fill=Library)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
position=position_dodge(.9)) +
labs(x = "Biotype", y = "Percentage of isoform")+
theme_classic() +
scale_fill_manual(values=c(col.p, col.t))+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 22, hjust = 1)) -> p
tmp <- with(gene_biotype, by(gene_biotype, Var1, function(x) t.test(value ~ Library, data = x)))
(lab <- signif(sapply(tmp, function(x) x$p.value), 2))
## Antisense lncRNA ncRNA Other Protein coding
## 1.1e-52 7.8e-11 2.9e-21 5.1e-22 1.9e-10
## Pseudogene
## 4.4e-01
y0 = as.vector(by(df2$value + df2$sd, factor(df2$Var1), max)) +
max(as.vector(by(df2$value + df2$sd, factor(df2$Var1), max))) * .03
p + ggplot2::annotate("text", x = 1:6,
y = y0,
label = lab)
sapply(lab, plevel)
## Antisense lncRNA ncRNA Other Protein coding
## "****" "****" "****" "****" "****"
## Pseudogene
## "ns"
p + ggplot2::annotate("text", x = 1:6,
y = y0,
label = sapply(lab, plevel))
E. Correlation of different method quantification
PolyA
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
Total
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
stopifnot(identical(SampInfo_PolyA$donor_id, SampInfo_Total$donor_id))
Gene_TPM <- list()
for(i in 1:40){
tmp <- merge(Gene_PolyA[[i]][, c(1, 3)], Gene_Total[[i]][, c(1, 3)], by = "gene_id")
colnames(tmp)[2:3] <- c("PolyA", "Total")
Gene_TPM[[i]] <- tmp
}
gene_cor <- mapply(Gene_TPM, FUN = function(x) cor(log1p(x[[2]]), log1p(x[[3]])))
ggviolin(gene_cor,
add = "boxplot",
fill = "#FC4E07",
alpha = .5,
add.params = list(fill = "white")) +
ggbeeswarm::geom_quasirandom(size = 2, alpha = .5, col = "#FC4E07") +
labs(y = "r")+
theme_classic() +
scale_fill_manual(values=c(col.p, col.t))+
guides(fill = FALSE)+
theme(axis.title = element_text(size = 22),
axis.text.y = element_text(size = 16),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank())
F. GO&KEGG enrichment of method-specific genes
Biotype of Library-specific isoforms
PolyA_Spe_isoform <- input$PolyA[!input$PolyA %in% input$Total]
Total_Spe_isoform <- input$Total[!input$Total %in% input$PolyA]
table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)
##
## Antisense lncRNA ncRNA Other Protein coding
## 1286 487 1275 2244 5816
## Pseudogene
## 632
table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)
##
## Antisense lncRNA ncRNA Other Protein coding
## 413 646 2687 2791 3772
## Pseudogene
## 545
table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)/length(Total_Spe_isoform)
##
## Antisense lncRNA ncRNA Other Protein coding
## 0.03805049 0.05951723 0.24755850 0.25714022 0.34752165
## Pseudogene
## 0.05021190
table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)/length(PolyA_Spe_isoform)
##
## Antisense lncRNA ncRNA Other Protein coding
## 0.10954003 0.04148211 0.10860307 0.19114140 0.49540034
## Pseudogene
## 0.05383305
df <- data.frame(
group = names(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)),
value = as.numeric(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype))
)
df
## group value
## 1 Antisense 1286
## 2 lncRNA 487
## 3 ncRNA 1275
## 4 Other 2244
## 5 Protein coding 5816
## 6 Pseudogene 632
library(ggplot2)
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
# Use brewer palette
pie +
scale_fill_brewer(palette="Dark2") +
blank_theme +
theme(axis.text.x = element_blank())+
geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:5]) + rev(value)/2,
label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
layout(title = 'PolyA',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df <- data.frame(
group = names(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)),
value = as.numeric(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype))
)
df
## group value
## 1 Antisense 413
## 2 lncRNA 646
## 3 ncRNA 2687
## 4 Other 2791
## 5 Protein coding 3772
## 6 Pseudogene 545
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
# Use brewer palette
pie +
scale_fill_brewer(palette="Dark2") +
blank_theme +
theme(axis.text.x = element_blank())+
geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:5]) + rev(value)/2,
label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)
library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
layout(title = 'Total',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df_PolyA <- data.frame(
group = names(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype)),
value = as.numeric(table(gtf_tx[PolyA_Spe_isoform,]$transcript_biotype))/length(PolyA_Spe_isoform),
Library = "PolyA")
df_Total <- data.frame(
group = names(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype)),
value = as.numeric(table(gtf_tx[Total_Spe_isoform,]$transcript_biotype))/length(Total_Spe_isoform),
Library = "Total")
df <- rbind(df_PolyA, df_Total)
ggplot(df, aes(group, value, fill = Library)) +
geom_col(position=position_dodge())+
# coord_flip() +
labs(x = "Biotype", y = "Percentage of isoform", title = "Biotype of Library-specific isoforms")+
theme_classic() +
# scale_y_continuous(breaks = c(-0.4, -0.2, 0.0, 0.2), label = c("0.4", "0.2", "0.0", "0.2"))+
scale_fill_manual(values=c(col.p, col.t))+
# guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
# axis.text.x = element_text(angle = 22, hjust = 1),
legend.position = "top")
Biotype of Library-specific genes
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_PolyA <- paste(SampInfo_PolyA$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_PolyA)))
Gene_PolyA <- lapply(files_PolyA, fread, select = c(1, 5, 6))
# filtering
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_PolyA <- lapply(Gene_PolyA, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
PolyA_gene <- unique(substr(do.call(rbind, Gene_PolyA)[[1]], 1, 15))
setwd("/mnt/raid62/Chen_Cell_2016/RSEM/EGAD00001002671")
files_Total <- paste(SampInfo_Total$ID , ".genes.results", sep = "")
stopifnot(all(file.exists(files_Total)))
Gene_Total <- lapply(files_Total, fread, select = c(1, 5, 6))
# filtering
Gene_Total <- lapply(Gene_Total, function(x) {x[expected_count >= 6 & TPM >= 0.1, ]})
Gene_Total <- lapply(Gene_Total, function(x) {x[, gene_id:=substr(gene_id, 1, 15)]})
Total_gene <- unique(substr(do.call(rbind, Gene_Total)[[1]], 1, 15))
input = list(PolyA = PolyA_gene, Total = Total_gene)
PolyA_Spe_gene <- input$PolyA[!input$PolyA %in% input$Total]
Total_Spe_gene <- input$Total[!input$Total %in% input$PolyA]
table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)
##
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 924 275 63 1185 644
table(gtf_gene[Total_Spe_gene,]$gene_biotype)
##
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 150 432 243 327 535
table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)/length(PolyA_Spe_gene)
##
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 0.29893238 0.08896797 0.02038175 0.38337108 0.20834681
table(gtf_gene[Total_Spe_gene,]$gene_biotype)/length(Total_Spe_gene)
##
## Antisense lncRNA ncRNA Protein coding Pseudogene
## 0.08891523 0.25607587 0.14404268 0.19383521 0.31713100
df <- data.frame(
group = names(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)),
value = as.numeric(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype))
)
df
## group value
## 1 Antisense 924
## 2 lncRNA 275
## 3 ncRNA 63
## 4 Protein coding 1185
## 5 Pseudogene 644
library(ggplot2)
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
# Use brewer palette
pie +
scale_fill_brewer(palette="Dark2") +
blank_theme +
theme(axis.text.x = element_blank())+
geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:4]) + rev(value)/2,
label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)
library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
layout(title = 'PolyA',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df <- data.frame(
group = names(table(gtf_gene[Total_Spe_gene,]$gene_biotype)),
value = as.numeric(table(gtf_gene[Total_Spe_gene,]$gene_biotype))
)
df
## group value
## 1 Antisense 150
## 2 lncRNA 432
## 3 ncRNA 243
## 4 Protein coding 327
## 5 Pseudogene 535
# Barplot
bp<- ggplot(df, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# use brewer color palettes
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
# Use brewer palette
pie +
scale_fill_brewer(palette="Dark2") +
blank_theme +
theme(axis.text.x = element_blank())+
geom_text(data = df, aes(x = 1.3, y = c(0, cumsum(rev(value))[1:4]) + rev(value)/2,
label = rev(paste0(round(value/sum(value)*100, 2), "%"))), size = 3)
library(plotly)
p <- plot_ly(df, labels = ~group, values = ~value, type = 'pie') %>%
layout(title = 'Total',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
df_PolyA <- data.frame(
group = names(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype)),
value = -as.numeric(table(gtf_gene[PolyA_Spe_gene,]$gene_biotype))/length(PolyA_Spe_gene),
Library = "PolyA")
df_Total <- data.frame(
group = names(table(gtf_gene[Total_Spe_gene,]$gene_biotype)),
value = as.numeric(table(gtf_gene[Total_Spe_gene,]$gene_biotype))/length(Total_Spe_gene),
Library = "Total")
df <- rbind(df_PolyA, df_Total)
ggplot(df, aes(group, value, fill = Library)) +
geom_col()+
coord_flip() +
labs(x = "Biotype", y = "Percentage of gene", title = "Biotype of Library-specific genes")+
theme_classic() +
scale_y_continuous(breaks = c(-0.4, -0.2, 0.0, 0.2), label = c("0.4", "0.2", "0.0", "0.2"))+
scale_fill_manual(values=c(col.p, col.t))+
# guides(fill = FALSE)+
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12),
# axis.text.x = element_text(angle = 22, hjust = 1),
legend.position = "top")
GO of Library-specific genes
suppressPackageStartupMessages(library(clusterProfiler))
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plotly':
##
## rename
## The following object is masked from 'package:plyr':
##
## rename
## The following objects are masked from 'package:data.table':
##
## first, second
## The following object is masked from 'package:base':
##
## expand.grid
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:plyr':
##
## desc
## The following object is masked from 'package:data.table':
##
## shift
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:plotly':
##
## select
##
PolyA_Spe_gene_ego <- enrichGO(gene = PolyA_Spe_gene,
# universe = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
Total_Spe_gene_ego <- enrichGO(gene = Total_Spe_gene,
# universe = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
dotplot(PolyA_Spe_gene_ego)
## wrong orderBy parameter; set to default `orderBy = "x"`
dotplot(Total_Spe_gene_ego)
## wrong orderBy parameter; set to default `orderBy = "x"`
gene_list = list(PolyA = as.character(bitr(PolyA_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID),
Total = as.character(bitr(Total_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID))
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
ck <- compareCluster(gene_list, fun = "enrichKEGG", organism="hsa")
dotplot(ck)
gene <- as.character(bitr(PolyA_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID)
## 'select()' returned 1:many mapping between keys and columns
PolyA_Spe_gene_kk <- enrichKEGG(gene = gene,
organism = 'hsa',
pvalueCutoff = 0.05)
gene <- as.character(bitr(Total_Spe_gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID)
## 'select()' returned 1:many mapping between keys and columns
Total_Spe_gene_kk <- enrichKEGG(gene = gene,
organism = 'hsa',
pvalueCutoff = 0.05)
if(nrow(PolyA_Spe_gene_kk) > 0) dotplot(PolyA_Spe_gene_kk)
## wrong orderBy parameter; set to default `orderBy = "x"`
if(nrow(Total_Spe_gene_kk) > 0) dotplot(Total_Spe_gene_kk)
## wrong orderBy parameter; set to default `orderBy = "x"`
xx2 <- compareCluster(geneClusters = list(PolyA = PolyA_Spe_gene, Total = Total_Spe_gene),
fun="enrichGO",
OrgDb = org.Hs.eg.db,
keyType= 'ENSEMBL',
ont = "ALL",
pAdjustMethod = "BH")
dotplot(xx2)